Before executing this notebook, check ./DerivedData for:
survivalData_tabulated.csvThe working directory should be ./Code
See also : ./Code/term-data-dictionary.html for data dictionary and commonly used abbreviations.
rm( list = ls() )
require(dplyr)
require(knitr)
require(kableExtra)
require(reshape2)
require(ggplot2)
require( VIM )
require(mice)
require( stringr )
options(knitr.kable.NA = '')
data.path <- "../DerivedData/"
tabSurv <- read.csv( paste0( data.path, "survivalData_tabulated.csv") )
startTime <- Sys.time()
This section produces the diagram shown in Figure 1 (Participant Flow) of the paper.
# -- total participants retrieved / harvested from CATIE data
total.N <- unique( tabSurv$ID )
# -- those missing data which means could not prospectively evaluate for TRS (so status.TRS == NA )
missing.BL <- tabSurv$ID[ which( is.na( tabSurv$status.TRS ) ) ]
missing.cases <- tabSurv[ which( is.na( tabSurv$status.TRS ) ), ]
# -- keep a copy of the whole of tabSurv (before removing cases that could not be included)
tabSurv.bak <- tabSurv
# -- remove missing.BL cases from tabSurv so we can continue defining sub-groups
tabSurv <- tabSurv[ -which( tabSurv$ID %in% missing.BL ), ]
# -- left to analyse
N.included <- nrow( tabSurv ) # total participants that we can analyse
# -- indices of patients who "trigger" the absolute thresholds at some point
AT.cases <- tabSurv[ which( !is.na( tabSurv$time.onset.TRS ) ), ]
# -- indices of patients who *never* trigger the absolute threshold
NeverAT.cases <- tabSurv[ which( is.na( tabSurv$time.onset.TRS ) ), ]
# -- of those who trigger (AT.cases), those that went on to develop TRS by full criteria
TRS.cases <- AT.cases[ which( AT.cases$status.TRS == 2), ]
# -- those in the AT group, but who were shown to be Not-TRS (for one of many reasons - see below)
NTR.cases <- AT.cases[ which( AT.cases$status.TRS == 1), ]
# -- Of those NTR cases, break down by :
# a) those who were still symptomatic, but didn't have >= 2 adequate trials
# - this group will be people "on the way" to TRS if they complete a second adequate Rx and show no response
# -- those in the AT group, who remained symptomatic, but only had 1 adequate Rx
# -- so are effectively right censored by exiting trial
oneRx.cases <- AT.cases[ which(
!is.na( AT.cases$time.onset.TRS ) &
AT.cases$numAdeq == 1 & # had one adeq Rx
AT.cases$status.sof == 1 & # but SOF was above threshold
AT.cases$status.sx == 1 # and symptoms persisted at TRS threshold
),
]
# -- those in the AT group, who remained symptomatic, but only had ZERO adequate Rx
# -- so are effectively right censored by exiting trial
zeroRx.cases <- AT.cases[ which(
!is.na( AT.cases$time.onset.TRS ) &
AT.cases$numAdeq == 0 & # had one adeq Rx
AT.cases$status.sof == 1 & # but SOF was above threshold
AT.cases$status.sx == 1 # and symptoms persisted at TRS threshold
),
]
library(DiagrammeR)
# Create a node data frame
nodes <-
create_node_df(n = 10,
label = c( paste("Retrieved =", length( total.N ) ), #1
paste("Excluded \nfor Missing\nData =", length(missing.BL) ), #2
paste("Included =", N.included ), #3
paste("NAT\nNever Above \nThreshold =", nrow( NeverAT.cases ) ), #4
paste("AT\nAbove\nThreshold =", nrow( AT.cases ) ), #5
paste("Not TRS =\n", nrow( NTR.cases ) ), #6
paste("Develop \n TRS =", nrow( TRS.cases ) ), #7
paste("Responded \n =",
nrow( NTR.cases ) - ( nrow( oneRx.cases ) +
nrow( zeroRx.cases) )), #8
paste("Symptomatic \n Only 1 \nAdeq Rx =",
nrow( oneRx.cases ) ), #9
paste("Symptomatic \n But 0 \nAdeq Rx =",
nrow( zeroRx.cases ) ) #10
),
type = "lower",
style = "filled",
fontsize = "8",
fixedsize = FALSE,
fontcolor = c("black"),
#fillcolor = c(rep("white",5),"PaleGreen","LightCoral","OrangeRed"),
fillcolor = c(rep("white",3),"#a1d99b","#ffeda0","#ffeda0","#fb6a4a","#a1d99b","#fee6ce","#fdae6b"),
shape = c("rectangle"))
edges <-
create_edge_df(from = c(1, 1, 3, 3, 5, 5, 6, 6, 6),
to = c(2, 3, 4, 5, 6, 7, 8, 9, 10),
rel = "leading_to")
graph <-
create_graph(
nodes_df = nodes,
edges_df = edges)
render_graph(graph, layout = "tree")
This graph describes:
Not TRS – the sub-population of the AT group who did not meet all the TRRIP criteria. In the “Not TRS” group:
This section produces the diagram for Figure 2 (see also: Supplementary Information Figure S2). Note that the diagram required composition and editing because of the limitations of displaying all the relevant figures on each section/intersection of the venn diagram.
The individual diagrams are written to (best viewed from) ./DerivedData
require(venneuler)
## Loading required package: venneuler
## Loading required package: rJava
# -- all IDs included in analyses (N = 1334)
included.IDs <- tabSurv$ID
N.total <- nrow( tabSurv )
# -- to do this, we'll need to manually inspect the raw trajectory data
attach( paste0( data.path, "raw_trajectories.RData" ) )
## The following object is masked _by_ .GlobalEnv:
##
## data.path
# -- number of people who at some point hit the SOF threshold and are in the included set of participants
temp.SOFAS <- TRS.sofas.traj[ which( TRS.sofas.traj$ID %in% included.IDs & TRS.sofas.traj$TRS.sofas.traj == 1 ), ]
ids.SOF <- unique( temp.SOFAS$src_subject_id )
count.SOF <- length( ids.SOF )
tab.SOF <- data.frame( Group = c("Moderate \n Or Above","Below \nModerate"),
Freq = c( count.SOF, N.total - count.SOF ))
# -- highlight moderate or above trials bar
bar.colors <- c("0" = "#8da0cb", "1" = "#fc8d62")
tab.SOF$colCode <- c(1,0)
ggplot(tab.SOF, aes(x = Group, y = Freq, label = Freq, fill = factor(colCode) ) ) +
geom_bar(stat = "identity") +
geom_text(size = 10, vjust = -0.6 ) +
scale_fill_manual(values=bar.colors) +
xlab("\nSocial And \n Occupational Impairment" ) +
ylab("") +
ylim(0, 1700) +
theme_classic(base_size = 30) +
theme( legend.position = "none") +
theme(axis.line=element_blank(),
#axis.text.x=element_blank(),
axis.text.x = element_text(face="bold", colour = "black", size=20 ),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
#axis.title.x=element_blank(),
#axis.title.y=element_blank(),
panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),plot.background=element_blank())
ggsave( paste0( data.path, "bar_SOF.png" ), bg = "transparent" )
## Saving 7 x 5 in image
# -- breakdown of adequate trials / number >= 2 adequate trials
temp.Rx <- TRS.rx.traj[ which( TRS.rx.traj$ID %in% included.IDs ), ]
tab.adeqRx <- data.frame( AdeqRx = seq(0, max( temp.Rx$cumAdeqRx ) ),
Freq = rep(0, max( temp.Rx$cumAdeqRx ) + 1 ) )
unique.ids <- unique( temp.Rx$ID )
ids.2Rx <- c()
for( i in 1:length( unique.ids ) ) {
this.adeq.rx <- max( temp.Rx$cumAdeqRx[ which( temp.Rx$ID == unique.ids[i] ) ] )
tab.adeqRx$Freq[ which( tab.adeqRx$AdeqRx == this.adeq.rx ) ] <- tab.adeqRx$Freq[ which( tab.adeqRx$AdeqRx == this.adeq.rx ) ] + 1
if( this.adeq.rx >= 2 ) {
ids.2Rx <- c( ids.2Rx, unique.ids[i] )
}
}
count.Rx <- length( ids.2Rx )
# -- highlight 2 adequate trials bar
bar.colors <- c("0" = "#8da0cb", "1" = "#fc8d62")
tab.adeqRx$colCode <- c(0,0,1,1)
ggplot(tab.adeqRx, aes(x = AdeqRx, y = Freq, label = Freq, fill = factor(colCode))) +
geom_bar(stat = "identity") +
geom_text(size = 10, vjust = -0.6 ) +
scale_fill_manual(values=bar.colors) +
xlab("\nNumber of Adequate Trials" ) +
ylab("") +
ylim(0, 800) +
theme_classic(base_size = 30) +
theme( legend.position = "none") +
theme(axis.line=element_blank(),
#axis.text.x=element_blank(),
axis.text.x = element_text(face="bold", colour = "black", size=20 ),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
#axis.title.x=element_blank(),
#axis.title.y=element_blank(),
panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),plot.background=element_blank())
ggsave( paste0( data.path, "bar_AdeqRx.png" ), bg = "transparent" )
## Saving 7 x 5 in image
# -- Absolute Sx thresholds
# -- breakdown of Sx criteria : at least moderate severity across domains, *ever* in trial (not response)
temp.Sx <- TRS.sx.traj[ which( TRS.sx.traj$ID %in% included.IDs ), ]
tab.Sx <- data.frame( Domain = c("P", "N", "G"),
Freq = rep(0, 3) )
unique.ids <- unique( temp.Sx$ID )
ids.Sx <- c()
for( i in 1:length( unique.ids ) ) {
this.sx <- ifelse( colSums( temp.Sx[ temp.Sx$ID == unique.ids[i] , c("TRS.pos", "TRS.neg", "TRS.gen") ] ) > 0, 1, 0 )
# -- increment frequency counts
tab.Sx$Freq <- as.numeric( tab.Sx$Freq + this.sx )
if( any( this.sx > 0 ) ) {
ids.Sx <- c( ids.Sx, unique.ids[i] )
}
}
# # -- compute proportions from included population (N = 1334) who at some point met P, N or G criteria
# tab.Sx$Freq <- tab.Sx$Freq / nrow( tabSurv )
count.Sx <- length( ids.Sx )
tab.Sx$Domain <- factor( tab.Sx$Domain, levels = c("P","N","G"), labels = c("P","N","G") )
ggplot(tab.Sx, aes(x = Domain, y = Freq, label = Freq) ) +
geom_bar(stat = "identity", fill = "#8da0cb") +
geom_text(size = 10, vjust = -0.6 ) +
xlab("\nPANSS Domain" ) +
ylab("") +
ylim(0, 1400) +
theme_classic(base_size = 30) +
theme( legend.position = "none") +
theme(axis.line=element_blank(),
axis.text.x = element_text(face="bold", colour = "black", size=20 ),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),plot.background=element_blank())
ggsave( paste0( data.path, "bar_Sx.png" ), bg = "transparent" )
## Saving 7 x 5 in image
vd <- c( SOF = count.SOF / N.total,
Sx = count.Sx / N.total,
Rx = count.Rx / N.total,
"SOF&Sx" = length(intersect(ids.SOF, ids.Sx) ) / N.total,
"SOF&Rx" = length(intersect(ids.SOF, ids.2Rx) ) / N.total,
"Sx&Rx" = length(intersect(ids.Sx, ids.2Rx) ) / N.total,
"SOF&Sx&Rx" = length( intersect( ids.SOF, intersect(ids.Sx, ids.2Rx) ) ) / N.total
)
vdd <- venneuler(vd)
png(paste0(data.path,"venn_labels.png"))
plot( vdd )
dev.off()
## png
## 2
For the 110 the relevant missing information was:
colSums( missing.cases[ ,c("missing.Sx","missing.Rx","missing.SOF") ] )
## missing.Sx missing.Rx missing.SOF
## 107 96 1
This code relates to the Results section of the paper, specifically Table 2, were we ask if the 110 excluded participants differ from the 1334 participants who could be included.
We used the same demographic and baseline data previously reported in the original CATIE trial paper:
Lieberman, J. A., Stroup, T. S., McEvoy, J. P., Swartz, M. S., Rosenheck, R. A., Perkins, D. O., … Hsiao, J. K. (2005). Effectiveness of Antipsychotic Drugs in Patients with Chronic Schizophrenia. New England Journal of Medicine, 353(12), 1209–1223. http://doi.org/10.1056/NEJMoa051688
However, some of the total 1444 participants are missing baseline and demographic data. We first begin by looking at the pattern of missing baseline data (reported as the footnote to Table 2), then proceed to imputation to enable analysis of differences between excluded and included participants.
# subset the complete data set (tabSurv) so we are testing only relevant variables
# -- this means we exclude all derived TRS related variables, and include only those reported in
# -- Lieberman et al (2005)
excludeCols <- c("ID", "time.inTrial", "onset.Sx", "onset.SOF", "onset.Rx", "time.onset.TRS",
"time.TRS", "numAdeq", "durAdeq", "totalRx", "status.rx", "status.sx", "status.sof",
"status.TRS", "TRS.pos", "TRS.neg", "missing.Sx", "missing.Rx", "missing.SOF",
"time.inTrialYrs", "time.onset.TRSYrs","time.TRSYrs" )
miss.plot <- aggr(tabSurv.bak[ , -which( names( tabSurv.bak ) %in% excludeCols ) ], col=c('navyblue','red'),
numbers=TRUE,
sortVars=TRUE,
labels=names(data),
cex.axis=.7, gap=3,
ylab=c("Histogram of missing data","Pattern")
)
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
##
## Variables sorted by number of missings:
## Variable Count
## yrsFirstTx 0.0263157895
## yrsFrstAntiPsyRx 0.0228531856
## employFT 0.0069252078
## yrsEduc 0.0055401662
## CGIsev 0.0034626039
## N 0.0013850416
## P 0.0006925208
## G 0.0006925208
## age 0.0000000000
## sex 0.0000000000
## exac3mo 0.0000000000
## olzB0 0.0000000000
## quetB0 0.0000000000
## rispB0 0.0000000000
## zipB0 0.0000000000
## halB0 0.0000000000
## decaB0 0.0000000000
## perB0 0.0000000000
## otherB0 0.0000000000
## race 0.0000000000
## marital 0.0000000000
## COPD 0.0000000000
## DM 0.0000000000
## HepABC 0.0000000000
## Lipid 0.0000000000
## HTN 0.0000000000
## IHD 0.0000000000
## OsteoArth 0.0000000000
## Osteopor 0.0000000000
## STI 0.0000000000
## Depression 0.0000000000
## alcDep_5yrs 0.0000000000
## alcAbuse_5yrs 0.0000000000
## drugDep_5yrs 0.0000000000
## drugAbuse_5yrs 0.0000000000
## OCD_5yrs 0.0000000000
## anxDis_5yrs 0.0000000000
This analysis shows that missing data is confined to:
yrsFirstTx (integer)yrsFirstAntiPsyRx (integer)employFT (binary)CGIsev (integer)yrsEduc (integer)P, negative N and general G (all integer) PANSSAlthough in very small numbers :
kable( miss.plot$missings[ which( miss.plot$missings$Count > 0), ], "html" ) %>%
kable_styling("striped", full_width = F)
| Variable | Count | |
|---|---|---|
| P | P | 1 |
| N | N | 2 |
| G | G | 1 |
| yrsEduc | yrsEduc | 8 |
| CGIsev | CGIsev | 5 |
| employFT | employFT | 10 |
| yrsFirstTx | yrsFirstTx | 38 |
| yrsFrstAntiPsyRx | yrsFrstAntiPsyRx | 33 |
totalData <- dim( tabSurv.bak[ , -which( names( tabSurv.bak ) %in% excludeCols ) ] )
missPerc <- 100 * sum( miss.plot$missings$Count ) / ( totalData[1] * totalData[2] )
# -- latex table
ltxTable <- miss.plot$missings[ which( miss.plot$missings$Count > 0), ]
rownames( ltxTable ) <- c()
ltxTable$percMiss <- round( 100 * ( ltxTable$Count / totalData[1] ), 2 )
colnames( ltxTable ) <- c("Variable", "Number", "Percentage")
ltxTable$Variable <- c("PANSS, P", "PANSS, N", "PANSS, G",
"Years Education", "CGI (severity)",
"Employed","Years First Treated (Emotional/Behavioural Problem)",
"Years First Treated (Antipsychotic)")
kable( ltxTable , "latex", booktabs = TRUE ) %>%
kable_styling(font_size = 8) %>%
add_header_above(c(" " = 1, "Missing Data" = 2))
In total, we have 0.183% missing data.
We will attempt multiple imputation with chained equations for the missing variables. Experiments revealed that for integer variables, predictive mean matching worked better than regression-based methods. Exceptions were yrsFirstTx and yrsFrstAntiPsyRx where random sampling from non-missing data provided more consistent imputations. For employFT (binary) logistic regression performed well.
tabSurv$missingFlag <- 0
missing.cases$missingFlag <- 1
to.impute.tab <- rbind( tabSurv, missing.cases )
to.impute.tab$employFT <- factor( tabSurv.bak$employFT )
# -- Add in some variables we DO NOT want imputed, but need to be carried in the imputed sets
# for analysis later, including times to events
# 1) -- final "yes"/"no" flag for TRS case ( == 1, or 0 == not)
to.impute.tab$final.TRS <- ifelse( to.impute.tab$status.TRS == 2, 1, 0 )
# 2) -- time to TRS (for a right censored model : If final.TRS == 1, event occurs at time.TRSYrs, else time.inTrialYrs)
# This will be the event time in a standard cox regression
to.impute.tab$eventTime.TRS <- to.impute.tab$time.inTrialYrs * ifelse( to.impute.tab$final.TRS == 0, 1, 0 ) + ## capture NON TRS times
to.impute.tab$final.TRS * ifelse( is.na( to.impute.tab$time.TRSYrs ), 0, to.impute.tab$time.TRSYrs ) ## capture TRS cases
# 3) -- Add a flag to denote participant is in the AT group
to.impute.tab$group.AT <- ifelse( to.impute.tab$ID %in% AT.cases$ID, 1, 0 )
excludeCols <- c("ID", "time.inTrial", "onset.Sx", "onset.SOF", "onset.Rx", "time.onset.TRS",
"time.TRS", "numAdeq", "durAdeq", "totalRx", "status.rx", "status.sx", "status.sof",
"status.TRS", "TRS.pos", "TRS.neg", "missing.Sx", "missing.Rx", "missing.SOF",
"time.inTrialYrs", "time.onset.TRSYrs","time.TRSYrs", "missingFlag", "final.TRS",
"eventTime.TRS", "group.AT")
# -- first, get a predictor matrix so we can remove variables from being used as predictors for imputation
init.mice <- mice( to.impute.tab, maxit = 0, print = FALSE )
## Warning: Number of logged events: 6
preds <- init.mice$predictorMatrix
meths <- init.mice$method
# -- ensure we remove excludeCols
preds[ , excludeCols ] <- 0
preds[ excludeCols, ] <- 0
# -- from testing, we know that the following imputation methods work better
meths[excludeCols] <- ""
meths["yrsFirstTx"] <- "sample"
meths["yrsFrstAntiPsyRx"] <- "sample"
imputedSets <- 20
MID <- mice( to.impute.tab, m = imputedSets, predictorMatrix = preds, method = meths, print = FALSE )
Inspecting the mixing of the MCMC chains:
plot( MID )
Shows no strong trend and good mixing of the chains.
The distribution of present data against imputed values shows they are within plausible ranges:
# - visualise imputed values
stripplot(MID,
data = P + N + G + yrsEduc + CGIsev + employFT + yrsFirstTx + yrsFrstAntiPsyRx~ .imp,
pch = 20, cex = 1.2)
For Table 2, we first run multiple univariate tests on each baseline/demographic variable to describe difference between included/excluded participants. This process is repeated for each of the imputed data sets as a sensitivity analysis.
# -- A function to compute differences in baseline / demographics between the missing (for TRS) and present groups
# This is built into a function because it's re-run for each imputed set (e.g. a sensitivity analysis).
diffBaseline <- function( descTab ) {
# -- build a vector describing eaach column in descTab as either nominal, binary or numeric data
dataType <- as.numeric( unlist( lapply(descTab, function(x) if (!is.numeric(x)) NA else max(x, na.rm = TRUE))) )
# remove missingFlag column
dataType <- dataType[ 1:(length( dataType ) - 1 ) ]
# parse dataType and decide on a relevant univariate test - e.g. Chi square for categorical proportions and t or KS for continuous
testFlag <- rep(NA, length( dataType ) )
for( i in 1:length( dataType ) ) {
if( dataType[i] == 1 | is.na( dataType[i] ) ) {
# categorical (factor) or binary : proportion test - Chi-square : code = 1
testFlag[i] <- 1
} else {
if( dataType[i] > 1 ) {
# continuous, therefore KS and t-test
testFlag[i] <- 2
}
}
}
# -- testFlag[i] == 1 instructs us to Chi-square for proportions (binary, categorical) or
# testFlag[i] == 2 instructs us to use either KS / t-test (depending on normality) for numerical / scale data
testTable <- data.frame( Var = rep("", length( dataType ) ),
M.Missing = rep(NA, length( dataType ) ),
Spread.Missing = rep(NA, length( dataType ) ),
M.Present = rep(NA, length( dataType ) ),
Spread.Present = rep(NA, length( dataType ) ),
Perc.Missing = rep(NA, length( dataType ) ),
N.one.missing = rep(NA, length( dataType ) ),
Perc.Present = rep(NA, length( dataType ) ),
N.one.present = rep(NA, length( dataType ) ),
P.Value = rep(NA, length( dataType ) ),
Test.Used = rep("", length( dataType ) ),
stringsAsFactors = FALSE
)
# Populate testTable, for all variables in testFlag ...
for( i in 1:length( testFlag ) ) {
thisVar <- names( descTab )[i]
testTable$Var[i] <- thisVar
# Select out data for missing overall group and non-missing group
grpMissing <- descTab[ which( descTab$missingFlag == 1 ), i ]
grpNotMissing <- descTab[ which( descTab$missingFlag == 0 ), i ]
# if for thisVar, testFlag == 1, it's ordinal / categorical data
if( testFlag[i] == 1 ) {
test.tab <- rbind( table( grpNotMissing ), table( grpMissing ) )
rownames(test.tab) <- c("Present","Missing")
suppressWarnings(
xs <- chisq.test( test.tab )
)
# -- absolute numbers with level = 1 for this variable in PRESENT (included) group
testTable$N.one.present[i] <- length( which( grpNotMissing == 1 ) )
testTable$N.one.missing[i] <- length( which( grpMissing == 1 ) )
testTable$Perc.Present[i] <- round( length( which( grpNotMissing == 1 ) ) / length( grpNotMissing ) * 100, 1 ) ## proportion of 1's to 0s in subjects with present data
testTable$Perc.Missing[i] <- round( length( which( grpMissing == 1 ) ) / length( grpMissing ) * 100, 1 )
## proportion of 1's to 0s in subjects with Missing data
testTable$P.Value[i] <- round( xs$p.value, 3 )
testTable$Test.Used[i] <- "ChiSq"
} else {
# thisVar is continuous, testFlag == 0
# We need either KS and t-test
# test normality with Shapiro-Wilks
this.SW.test.missing <- shapiro.test( grpMissing )
this.SW.test.notmissing <- shapiro.test( grpNotMissing )
if( this.SW.test.missing$p.value < 0.05 | this.SW.test.notmissing$p.value < 0.05 ) {
# if p.value of either SW test < 0.05, then use KS (i.e. at least one distribution is non-normal)
suppressWarnings(
this.test <- ks.test( grpMissing, grpNotMissing, alternative = "two.sided" )
)
} else {
# both missing and non-missing data are normal enough
suppressWarnings(
this.test <- t.test( grpMissing, grpNotMissing, alternative = "two.sided" )
)
}
if( this.test$method == "Two-sample Kolmogorov-Smirnov test" ) {
# record median and IQR
testTable$M.Missing[i] <- median( grpMissing, na.rm = TRUE )
testTable$Spread.Missing[i] <- IQR( grpMissing, na.rm = TRUE )
testTable$M.Present[i] <- median( grpNotMissing, na.rm = TRUE )
testTable$Spread.Present[i] <- IQR( grpNotMissing, na.rm = TRUE )
testTable$P.Value[i] <- round( this.test$p.value, 3 )
testTable$Test.Used[i] <- "KS"
}
if( this.test$method == "Welch Two Sample t-test" ) {
# record median and IQR
testTable$M.Missing[i] <- mean( grpMissing, na.rm = TRUE )
testTable$Spread.Missing[i] <- sd( grpMissing, na.rm = TRUE )
testTable$M.Present[i] <- mean( grpNotMissing, na.rm = TRUE )
testTable$Spread.Present[i] <- sd( grpNotMissing, na.rm = TRUE )
testTable$P.Value[i] <- round( this.test$p.value, 3 )
testTable$Test.Used[i] <- "T"
}
}
}
# add a significance flag (purely cosmetic)
testTable$signif <- ifelse( testTable$P.Value < 0.05, "*", " " )
return( testTable )
}
# -- a list of statistically significant differences between : missing data for TRS and present
# We check each imputed set as a sensitivity analysis to find the largest set of
# differences.
listSigVars <- vector("list", imputedSets)
# -- variables we DO NOT want tested
dontAnalyse <- c("ID", "time.inTrial", "onset.Sx", "onset.SOF", "onset.Rx", "time.onset.TRS",
"time.TRS", "numAdeq", "durAdeq", "totalRx", "status.rx", "status.sx", "status.sof",
"status.TRS", "TRS.pos", "TRS.neg", "missing.Sx", "missing.Rx", "missing.SOF",
"time.inTrialYrs", "time.onset.TRSYrs","time.TRSYrs", "final.TRS", "eventTime.TRS", "group.AT" )
for( i in 1:imputedSets ) {
# -- for each imputed set, establish differences in baseline data for missing for TRS / present for TRS groups (see tree diagram above)
thisSet <- complete( MID, i )
diffTab <- diffBaseline( thisSet[ , -which( names( thisSet ) %in% dontAnalyse ) ] )
# -- keep the significant variables
listSigVars[[i]] <- diffTab$Var[ which( diffTab$P.Value < 0.05 ) ]
}
If imputation makes no difference to the baseline/demographic variables that are different between included/excluded participants, then the variables that are significantly different (between included/excluded participants) should be consistent across all 20 imputed sets. We can check this by tabulating the number of times significantly-different variables occur over the imputed sets; complete consistency would be each significantly-different variable occuring 20 times:
kable( table( unlist( listSigVars ) ) )
| Var1 | Freq |
|---|---|
| alcAbuse_5yrs | 20 |
| alcDep_5yrs | 20 |
| drugDep_5yrs | 20 |
| exac3mo | 20 |
| otherB0 | 20 |
| race | 20 |
| STI | 20 |
| yrsFirstTx | 17 |
This tells us that across imputations, the same baseline variables differ (in the included versus excluded participants) across imputated sets – i.e. the baseline differences between included/excluded groups are not sensitive to the small proportion of missing data.
Finally, we display the multiple univariate differences for the source data (which has missing values for some demographic variables) confident that multiple imputation makes little difference to the significant differences in baseline variables.
With some annotation, the table of multiple univariate differences is as follows:
# -- to.impute.tab is the complete data set with missing data, used as the basis
# for imputation.
testTable <- diffBaseline( to.impute.tab[ , -which( names( to.impute.tab ) %in% dontAnalyse ) ] )
insertRow <- function(existingDF, newrow, r) {
existingDF <- rbind(existingDF,newrow)
existingDF <- existingDF[order(c(1:(nrow(existingDF)-1),r-0.5)),]
row.names(existingDF) <- 1:nrow(existingDF)
return(existingDF)
}
# organise and select vars in grouping/order from Lieberman et al (2005) paper (Table 1)
rowOrder <- c(4,5,20,7,21,9,6,1,2,3,8,10,11,31:37,12:19,23,25,26,24,27:30)
testTable.display <- testTable[ rowOrder, ]
# -- tidy labels
testTable.display$Var[1] <- " Age"
testTable.display$Var[2] <- " Sex (Male)"
testTable.display$Var[3] <- " Race"
testTable.display$Var[4] <- " Education (Yrs)"
testTable.display$Var[5] <- " Marital Status"
testTable.display$Var[6] <- " Employment (FT)"
testTable.display$Var[7] <- " Exacerbation (past 3 months)"
testTable.display$Var[8] <- " Positive"
testTable.display$Var[9] <- " Negative"
testTable.display$Var[10]<- " General"
testTable.display$Var[11]<- " CGI Severity"
testTable.display$Var[12]<- " For Behaviour/Emotional Problem"
testTable.display$Var[13]<- " With Antipsychotic Medication"
testTable.display$Var[14]<- " Depression"
testTable.display$Var[15]<- " Alcohol Dependency"
testTable.display$Var[16]<- " Alcohol Abuse"
testTable.display$Var[17]<- " Drug Dependency"
testTable.display$Var[18]<- " Drug Abuse"
testTable.display$Var[19]<- " OCD"
testTable.display$Var[20]<- " Anxiety"
testTable.display$Var[21]<- " Olanzapine"
testTable.display$Var[22]<- " Quetiapine"
testTable.display$Var[23]<- " Risperidone"
testTable.display$Var[24]<- " Ziprasidone"
testTable.display$Var[25]<- " Haloperidol"
testTable.display$Var[26]<- " Decanoate"
testTable.display$Var[27]<- " Perphenazine"
testTable.display$Var[28]<- " Other Rx"
testTable.display$Var[29]<- " Diabetes"
testTable.display$Var[30]<- " Hyperlipidaemia"
testTable.display$Var[31]<- " Hypertension"
testTable.display$Var[32]<- " Hepatitis (ABC)"
testTable.display$Var[33]<- " Isch. Heart Disease"
testTable.display$Var[34]<- " Osteoarthritis"
testTable.display$Var[35]<- " Osteoporosis"
testTable.display$Var[36]<- " STI"
# insert group labels
testTable.display <- insertRow( testTable.display, c("Demographics",rep(NA,9)), 1)
testTable.display <- insertRow( testTable.display, c("PANSS",rep(NA,9)), 9)
testTable.display <- insertRow( testTable.display, c("Psychiatric History",rep(NA,9)), 14)
testTable.display <- insertRow( testTable.display, c("Years Since First Treatment",rep(NA,9)), 15)
testTable.display <- insertRow( testTable.display, c("SCID diagnosis (past 5 years)",rep(NA,9)), 18)
testTable.display <- insertRow( testTable.display, c("Baseline Antipsychotic Medication",rep(NA,9)), 26)
testTable.display <- insertRow( testTable.display, c("Baseline Medical Diagnoses",rep(NA,9)), 35)
names( testTable.display ) <- c(
"Variable",
"Median",
"IQR",
"Median",
"IQR",
"Percentage1",
"N1",
"Percentage2",
"N2",
"P-value",
"Test",
"P < 0.05"
)
kable(testTable.display, "html") %>%
kable_styling("striped", full_width = F) %>%
row_spec(c(1,9,14,15,18,26,35), bold = T, color = "black", background = "#bdbdbd") %>%
add_header_above(c(" " = 1, "Excluded (Missing Data)" = 2, "Included" = 2,
"Excluded (Missing Data)" = 2, "Included" = 2, " " = 3))
| Variable | Median | IQR | Median | IQR | Percentage1 | N1 | Percentage2 | N2 | P-value | Test | P < 0.05 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Demographics | Demographics | ||||||||||
| Age | 39.5 | 16 | 42 | 17 | 0.225 | KS | |||||
| Sex (Male) | 79.1 | 87 | 73.6 | 982 | 0.252 | ChiSq | |||||
| Race | 0 | 0 | 0 | 0 | 0.004 | ChiSq |
|
||||
| Education (Yrs) | 12 | 1 | 12 | 1.5 | 0.804 | KS | |||||
| Marital Status | 0 | 0 | 0 | 0 | 0.872 | ChiSq | |||||
| Employment (FT) | 5.5 | 6 | 6.8 | 91 | 0.729 | ChiSq | |||||
| Exacerbation (past 3 months) | 36.4 | 40 | 26.8 | 357 | 0.04 | ChiSq |
|
||||
| PANSS | PANSS | ||||||||||
| Positive | 19 | 7.75 | 18 | 8 | 0.344 | KS | |||||
| Negative | 20 | 10 | 20 | 8 | 0.964 | KS | |||||
| General | 36.5 | 13 | 37 | 13 | 0.877 | KS | |||||
| CGI Severity | 4 | 1 | 4 | 1 | 0.998 | KS | |||||
| Psychiatric History | Psychiatric History | ||||||||||
| Years Since First Treatment | Years Since First Treatment | ||||||||||
| For Behaviour/Emotional Problem | 15 | 13 | 15 | 18 | 0.033 | KS |
|
||||
| With Antipsychotic Medication | 13 | 15 | 12 | 18 | 0.104 | KS | |||||
| SCID diagnosis (past 5 years) | SCID diagnosis (past 5 years) | ||||||||||
| Depression | 29.1 | 32 | 27.5 | 367 | 0.806 | ChiSq | |||||
| Alcohol Dependency | 22.7 | 25 | 13.5 | 180 | 0.012 | ChiSq |
|
||||
| Alcohol Abuse | 25.5 | 28 | 17.4 | 232 | 0.047 | ChiSq |
|
||||
| Drug Dependency | 25.5 | 28 | 16 | 214 | 0.016 | ChiSq |
|
||||
| Drug Abuse | 27.3 | 30 | 19.9 | 265 | 0.084 | ChiSq | |||||
| OCD | 2.7 | 3 | 5.2 | 69 | 0.366 | ChiSq | |||||
| Anxiety | 17.3 | 19 | 13.3 | 178 | 0.313 | ChiSq | |||||
| Baseline Antipsychotic Medication | Baseline Antipsychotic Medication | ||||||||||
| Olanzapine | 22.7 | 25 | 27 | 360 | 0.39 | ChiSq | |||||
| Quetiapine | 9.1 | 10 | 9.9 | 132 | 0.916 | ChiSq | |||||
| Risperidone | 21.8 | 24 | 22.6 | 302 | 0.937 | ChiSq | |||||
| Ziprasidone | 1.8 | 2 | 5.1 | 68 | 0.191 | ChiSq | |||||
| Haloperidol | 6.4 | 7 | 5.8 | 77 | 0.966 | ChiSq | |||||
| Decanoate | 0.9 | 1 | 1.6 | 21 | 0.887 | ChiSq | |||||
| Perphenazine | 0.9 | 1 | 2.2 | 29 | 0.585 | ChiSq | |||||
| Other Rx | 0.9 | 1 | 8.9 | 119 | 0.006 | ChiSq |
|
||||
| Baseline Medical Diagnoses | Baseline Medical Diagnoses | ||||||||||
| Diabetes | 10 | 11 | 10.7 | 143 | 0.941 | ChiSq | |||||
| Hyperlipidaemia | 11.8 | 13 | 14.3 | 191 | 0.561 | ChiSq | |||||
| Hypertension | 23.6 | 26 | 19.7 | 263 | 0.388 | ChiSq | |||||
| Hepatitis (ABC) | 10 | 11 | 5.8 | 77 | 0.115 | ChiSq | |||||
| Isch. Heart Disease | 0.9 | 1 | 0.8 | 11 | 1 | ChiSq | |||||
| Osteoarthritis | 2.7 | 3 | 5.2 | 70 | 0.351 | ChiSq | |||||
| Osteoporosis | 0.9 | 1 | 0.7 | 10 | 1 | ChiSq | |||||
| STI | 2.7 | 3 | 0.5 | 7 | 0.038 | ChiSq |
|
# -- latex version :
ltxTbl <- testTable.display[ , -which( colnames( testTable.display ) %in%
c("Test", "P < 0.05", "P-value","Percentage1", "N1",
"Percentage2","N2")
)
]
# -- merge "Percentage" and "N" columns for display
temp.NP.Exc <- paste0( testTable.display[,"Percentage1"], " (", testTable.display[,"N1"], ")")
temp.NP.Inc <- paste0( testTable.display[,"Percentage2"], " (", testTable.display[,"N2"], ")")
temp.NP.Exc[ which( ( str_match( temp.NP.Exc, "NA") == "NA" ) ) ] <- ""
temp.NP.Inc[ which( ( str_match( temp.NP.Inc, "NA") == "NA" ) ) ] <- ""
# -- merge P val and * flag
temp.p <- paste( testTable.display[,"P-value"], testTable.display[,"P < 0.05"])
# -- replace NAs with blank strings
temp.p[ which( ( str_match( temp.p, "NA") == "NA" ) ) ] <- ""
ltxTbl$Percent.Exc <- temp.NP.Exc
ltxTbl$Percent.Inc <- temp.NP.Inc
ltxTbl$P.value <- temp.p
colnames(ltxTbl) <- c("Variable","Median","IQR","Median","IQR","Percent (N)","Percent (N)","P Value")
# -- because race and martial are multi-level (not binary) we need to blank the
# -- percentages and report separately
ltxTbl[ which( ltxTbl$Variable %in% c(" Marital Status") ) , 6:7 ] <- NA
ltxTbl[ which( ltxTbl$Variable %in% c(" Race") ) , 6:7 ] <- NA
kable(ltxTbl, "latex", booktabs = TRUE) %>%
kable_styling(font_size = 6) %>%
row_spec(c(1,9,14,15,18,26,35), bold = T, color = "black", background = "#bdbdbd") %>%
add_header_above(c(" " = 1, "Excluded" = 2, "Included" = 2,
"Excluded" = 1, "Included" = 1, " " = 1))
Race appears to differ (for those excluded for missing data compared to the group included in analyses) and as race is nominal with 3 levels, we need more detail than provided in the preceding table.
levels( to.impute.tab$race )
## [1] "Black" "Other" "White"
# -- explore proportion of missing cases by racial group
raceProp <- xtabs( ~ to.impute.tab$race + to.impute.tab$missingFlag )
colnames( raceProp ) <- c("Included","Excluded")
raceProp <- raceProp[,c(2,1)]
kable(raceProp, "html") %>%
kable_styling("striped", full_width = F)
| Excluded | Included | |
|---|---|---|
| Black | 52 | 451 |
| Other | 8 | 64 |
| White | 50 | 819 |
kable(raceProp, "latex", booktabs = TRUE) %>%
kable_styling(font_size = 6)
Similarly, for marital status:
# -- explore proportion of missing cases by marital status
maritalProp <- xtabs( ~ to.impute.tab$marital + to.impute.tab$missingFlag )
colnames( maritalProp ) <- c("Included","Excluded")
kable(maritalProp, "html") %>%
kable_styling("striped", full_width = F)
| Included | Excluded | |
|---|---|---|
| Married | 153 | 13 |
| NeverMarried | 797 | 63 |
| PrevMarried | 384 | 34 |
kable(maritalProp, "latex", booktabs = TRUE) %>%
kable_styling(font_size = 6)
# -- the imputed baseline data
save(MID, file = "../DerivedData/imputed_tabSurv.RData")
cat("Execution time")
## Execution time
Sys.time() - startTime
## Time difference of 43.52555 secs